import numpy as np
import pandas as pd
df = pd.read_csv('data/train.csv')
# 'Postal Code' contains null values
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9800 entries, 0 to 9799 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Row ID 9800 non-null int64 1 Order ID 9800 non-null object 2 Order Date 9800 non-null object 3 Ship Date 9800 non-null object 4 Ship Mode 9800 non-null object 5 Customer ID 9800 non-null object 6 Customer Name 9800 non-null object 7 Segment 9800 non-null object 8 Country 9800 non-null object 9 City 9800 non-null object 10 State 9800 non-null object 11 Postal Code 9789 non-null float64 12 Region 9800 non-null object 13 Product ID 9800 non-null object 14 Category 9800 non-null object 15 Sub-Category 9800 non-null object 16 Product Name 9800 non-null object 17 Sales 9800 non-null float64 dtypes: float64(2), int64(1), object(15) memory usage: 1.3+ MB
# 'Postal Code' contains null values
df.isnull().sum()
Row ID 0 Order ID 0 Order Date 0 Ship Date 0 Ship Mode 0 Customer ID 0 Customer Name 0 Segment 0 Country 0 City 0 State 0 Postal Code 11 Region 0 Product ID 0 Category 0 Sub-Category 0 Product Name 0 Sales 0 dtype: int64
# Location of null values
df[df['Postal Code'].isnull()]
| Row ID | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | State | Postal Code | Region | Product ID | Category | Sub-Category | Product Name | Sales | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2234 | 2235 | CA-2018-104066 | 05/12/2018 | 10/12/2018 | Standard Class | QJ-19255 | Quincy Jones | Corporate | United States | Burlington | Vermont | NaN | East | TEC-AC-10001013 | Technology | Accessories | Logitech ClearChat Comfort/USB Headset H390 | 205.03 |
| 5274 | 5275 | CA-2016-162887 | 07/11/2016 | 09/11/2016 | Second Class | SV-20785 | Stewart Visinsky | Consumer | United States | Burlington | Vermont | NaN | East | FUR-CH-10000595 | Furniture | Chairs | Safco Contoured Stacking Chairs | 715.20 |
| 8798 | 8799 | US-2017-150140 | 06/04/2017 | 10/04/2017 | Standard Class | VM-21685 | Valerie Mitchum | Home Office | United States | Burlington | Vermont | NaN | East | TEC-PH-10002555 | Technology | Phones | Nortel Meridian M5316 Digital phone | 1294.75 |
| 9146 | 9147 | US-2017-165505 | 23/01/2017 | 27/01/2017 | Standard Class | CB-12535 | Claudia Bergmann | Corporate | United States | Burlington | Vermont | NaN | East | TEC-AC-10002926 | Technology | Accessories | Logitech Wireless Marathon Mouse M705 | 99.98 |
| 9147 | 9148 | US-2017-165505 | 23/01/2017 | 27/01/2017 | Standard Class | CB-12535 | Claudia Bergmann | Corporate | United States | Burlington | Vermont | NaN | East | OFF-AR-10003477 | Office Supplies | Art | 4009 Highlighters | 8.04 |
| 9148 | 9149 | US-2017-165505 | 23/01/2017 | 27/01/2017 | Standard Class | CB-12535 | Claudia Bergmann | Corporate | United States | Burlington | Vermont | NaN | East | OFF-ST-10001526 | Office Supplies | Storage | Iceberg Mobile Mega Data/Printer Cart | 1564.29 |
| 9386 | 9387 | US-2018-127292 | 19/01/2018 | 23/01/2018 | Standard Class | RM-19375 | Raymond Messe | Consumer | United States | Burlington | Vermont | NaN | East | OFF-PA-10000157 | Office Supplies | Paper | Xerox 191 | 79.92 |
| 9387 | 9388 | US-2018-127292 | 19/01/2018 | 23/01/2018 | Standard Class | RM-19375 | Raymond Messe | Consumer | United States | Burlington | Vermont | NaN | East | OFF-PA-10001970 | Office Supplies | Paper | Xerox 1881 | 12.28 |
| 9388 | 9389 | US-2018-127292 | 19/01/2018 | 23/01/2018 | Standard Class | RM-19375 | Raymond Messe | Consumer | United States | Burlington | Vermont | NaN | East | OFF-AP-10000828 | Office Supplies | Appliances | Avanti 4.4 Cu. Ft. Refrigerator | 542.94 |
| 9389 | 9390 | US-2018-127292 | 19/01/2018 | 23/01/2018 | Standard Class | RM-19375 | Raymond Messe | Consumer | United States | Burlington | Vermont | NaN | East | OFF-EN-10001509 | Office Supplies | Envelopes | Poly String Tie Envelopes | 2.04 |
| 9741 | 9742 | CA-2016-117086 | 08/11/2016 | 12/11/2016 | Standard Class | QJ-19255 | Quincy Jones | Corporate | United States | Burlington | Vermont | NaN | East | FUR-BO-10004834 | Furniture | Bookcases | Riverside Palais Royal Lawyers Bookcase, Royal... | 4404.90 |
# Check if there is postal codes for the same location in other parts of the data
df[(df['Country'] == 'United States') & (df['City'] == 'Burlington') & (df['State'] == 'Vermont')]
| Row ID | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | State | Postal Code | Region | Product ID | Category | Sub-Category | Product Name | Sales | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2234 | 2235 | CA-2018-104066 | 05/12/2018 | 10/12/2018 | Standard Class | QJ-19255 | Quincy Jones | Corporate | United States | Burlington | Vermont | NaN | East | TEC-AC-10001013 | Technology | Accessories | Logitech ClearChat Comfort/USB Headset H390 | 205.03 |
| 5274 | 5275 | CA-2016-162887 | 07/11/2016 | 09/11/2016 | Second Class | SV-20785 | Stewart Visinsky | Consumer | United States | Burlington | Vermont | NaN | East | FUR-CH-10000595 | Furniture | Chairs | Safco Contoured Stacking Chairs | 715.20 |
| 8798 | 8799 | US-2017-150140 | 06/04/2017 | 10/04/2017 | Standard Class | VM-21685 | Valerie Mitchum | Home Office | United States | Burlington | Vermont | NaN | East | TEC-PH-10002555 | Technology | Phones | Nortel Meridian M5316 Digital phone | 1294.75 |
| 9146 | 9147 | US-2017-165505 | 23/01/2017 | 27/01/2017 | Standard Class | CB-12535 | Claudia Bergmann | Corporate | United States | Burlington | Vermont | NaN | East | TEC-AC-10002926 | Technology | Accessories | Logitech Wireless Marathon Mouse M705 | 99.98 |
| 9147 | 9148 | US-2017-165505 | 23/01/2017 | 27/01/2017 | Standard Class | CB-12535 | Claudia Bergmann | Corporate | United States | Burlington | Vermont | NaN | East | OFF-AR-10003477 | Office Supplies | Art | 4009 Highlighters | 8.04 |
| 9148 | 9149 | US-2017-165505 | 23/01/2017 | 27/01/2017 | Standard Class | CB-12535 | Claudia Bergmann | Corporate | United States | Burlington | Vermont | NaN | East | OFF-ST-10001526 | Office Supplies | Storage | Iceberg Mobile Mega Data/Printer Cart | 1564.29 |
| 9386 | 9387 | US-2018-127292 | 19/01/2018 | 23/01/2018 | Standard Class | RM-19375 | Raymond Messe | Consumer | United States | Burlington | Vermont | NaN | East | OFF-PA-10000157 | Office Supplies | Paper | Xerox 191 | 79.92 |
| 9387 | 9388 | US-2018-127292 | 19/01/2018 | 23/01/2018 | Standard Class | RM-19375 | Raymond Messe | Consumer | United States | Burlington | Vermont | NaN | East | OFF-PA-10001970 | Office Supplies | Paper | Xerox 1881 | 12.28 |
| 9388 | 9389 | US-2018-127292 | 19/01/2018 | 23/01/2018 | Standard Class | RM-19375 | Raymond Messe | Consumer | United States | Burlington | Vermont | NaN | East | OFF-AP-10000828 | Office Supplies | Appliances | Avanti 4.4 Cu. Ft. Refrigerator | 542.94 |
| 9389 | 9390 | US-2018-127292 | 19/01/2018 | 23/01/2018 | Standard Class | RM-19375 | Raymond Messe | Consumer | United States | Burlington | Vermont | NaN | East | OFF-EN-10001509 | Office Supplies | Envelopes | Poly String Tie Envelopes | 2.04 |
| 9741 | 9742 | CA-2016-117086 | 08/11/2016 | 12/11/2016 | Standard Class | QJ-19255 | Quincy Jones | Corporate | United States | Burlington | Vermont | NaN | East | FUR-BO-10004834 | Furniture | Bookcases | Riverside Palais Royal Lawyers Bookcase, Royal... | 4404.90 |
# Check the type of values in 'Postal Code'
df['Postal Code'].unique()
array([42420., 90036., 33311., 90032., 28027., 98103., 76106., 53711.,
84084., 94109., 68025., 19140., 84057., 90049., 77095., 75080.,
77041., 60540., 32935., 55122., 48185., 19901., 47150., 10024.,
12180., 90004., 60610., 85234., 22153., 10009., 49201., 38109.,
77070., 35601., 94122., 27707., 60623., 29203., 55901., 55407.,
97206., 55106., 80013., 28205., 60462., 10035., 50322., 43229.,
37620., 19805., 61701., 85023., 95661., 64055., 91104., 43055.,
53132., 85254., 95123., 98105., 98115., 73034., 90045., 19134.,
88220., 78207., 77036., 62521., 71203., 6824., 75051., 92374.,
45011., 7090., 19120., 44312., 80219., 75220., 37064., 90604.,
48601., 44256., 43017., 48227., 38401., 33614., 95051., 55044.,
92037., 77506., 94513., 27514., 7960., 45231., 94110., 90301.,
33319., 80906., 7109., 48180., 8701., 22204., 80004., 7601.,
33710., 19143., 90805., 92345., 37130., 84041., 78745., 1852.,
31907., 6040., 78550., 85705., 62301., 2038., 33024., 98198.,
61604., 89115., 2886., 33180., 28403., 92646., 40475., 80027.,
1841., 39212., 48187., 10801., 28052., 32216., 47201., 13021.,
73071., 94521., 60068., 79109., 11757., 90008., 92024., 77340.,
14609., 72701., 92627., 80134., 30318., 64118., 59405., 48234.,
33801., 36116., 85204., 60653., 54302., 45503., 92804., 98270.,
97301., 78041., 75217., 43123., 10011., 48126., 31088., 94591.,
92691., 48307., 7060., 85635., 98661., 60505., 76017., 40214.,
75081., 44105., 75701., 27217., 22980., 19013., 27511., 32137.,
10550., 48205., 33012., 11572., 92105., 60201., 48183., 55016.,
71111., 50315., 93534., 23223., 28806., 92530., 68104., 98026.,
92704., 53209., 41042., 44052., 7036., 93905., 8901., 17602.,
3301., 21044., 75043., 6360., 22304., 43615., 87401., 92503.,
90503., 78664., 92054., 33433., 23464., 92563., 28540., 52601.,
98502., 20016., 65109., 63376., 61107., 33142., 78521., 10701.,
94601., 28110., 20735., 30076., 72401., 47374., 94509., 33030.,
46350., 48911., 44221., 89502., 22801., 92025., 48073., 20852.,
33065., 14215., 33437., 39503., 93727., 27834., 11561., 35630.,
31204., 52402., 2908., 81001., 94533., 32725., 42071., 6457.,
11520., 90660., 84604., 84062., 30080., 24153., 44134., 36608.,
2740., 75061., 8360., 85301., 14304., 27360., 92683., 38301.,
75019., 91767., 89031., 18103., 19711., 85281., 92677., 8302.,
2149., 13601., 54915., 98006., 75002., 79907., 76051., 75007.,
37167., 98031., 70506., 97224., 60076., 75023., 23434., 46203.,
7002., 28314., 27405., 21215., 53142., 66062., 98002., 74133.,
97756., 27604., 74403., 6450., 42104., 46614., 6010., 89015.,
99207., 76248., 45014., 32127., 97504., 22901., 59801., 33178.,
29501., 97477., 32712., 19601., 80020., 65807., 7501., 73120.,
23320., 79424., 65203., 37604., 36830., 92404., 1453., 59715.,
85345., 44107., 8861., 91761., 91730., 56560., 75150., 95207.,
32174., 94086., 3820., 17403., 77840., 63116., 2169., 95336.,
44240., 76903., 84106., 35810., 37918., 72209., 48146., 43302.,
80122., nan, 4401., 38671., 47362., 48640., 57103., 80525.,
47905., 37042., 95823., 91360., 2148., 1040., 87105., 89431.,
92236., 60126., 7055., 29406., 23602., 14701., 46544., 43402.,
92253., 32303., 37211., 98226., 60098., 76117., 60090., 29483.,
71901., 80112., 43130., 88001., 35244., 75034., 95687., 84107.,
53186., 93309., 33068., 45373., 78415., 90278., 32839., 7050.,
70601., 60035., 11550., 46060., 55124., 29464., 48310., 54703.,
78577., 59102., 97030., 37421., 83642., 92307., 60440., 55369.,
95695., 77489., 77581., 94403., 49505., 93277., 66212., 92592.,
92399., 2151., 77301., 60477., 52001., 48127., 87505., 28601.,
60188., 56301., 33161., 46226., 33317., 34952., 29730., 79762.,
53214., 91911., 66502., 16602., 80229., 61821., 47401., 71854.,
78539., 77520., 46142., 90712., 2895., 54880., 76021., 98042.,
74012., 33023., 33021., 77536., 67212., 78501., 52240., 83704.,
2920., 61032., 77642., 95610., 75056., 98052., 32114., 86442.,
46368., 58103., 46514., 91776., 33063., 30328., 44060., 73505.,
23666., 13440., 54601., 83501., 39401., 94526., 48858., 84321.,
6708., 30605., 4240., 61832., 85323., 30062., 85364., 54401.,
99301., 60302., 32503., 77573., 20877., 84043., 35401., 92553.,
40324., 80538., 85224., 59601., 63122., 76706., 48066., 60423.,
18018., 55113., 68801., 55125., 48237., 72756., 88101., 33458.,
93101., 75104., 68701., 84020., 48104., 91941., 83201., 49423.,
6460., 60089., 92630., 96003., 95928., 13501., 72032., 82001.,
42301., 83605., 70065., 3060., 38134., 94061., 37087., 93454.,
60016., 98632., 37075., 50701., 2138., 60067., 1915., 97405.,
93030., 98059., 60025., 33445., 80022., 77590., 27893., 87124.,
27534., 98208., 90640., 92020., 77705., 33407., 79605., 61761.,
63301., 60174., 93010., 97123., 91505., 95351., 67846., 8401.,
80501., 95616., 26003., 95037., 7011., 53081., 30344., 57701.,
1810., 34741., 6484., 6810., 52302., 32771., 78666., 80634.,
76063., 44035., 83301., 33134., 60441., 1752., 20707., 77803.,
71603., 57401., 21740., 7017., 60004., 60543., 55433., 92672.,
94568., 93405., 72762.])
# Update postal code with any value not in the present in the data
df.fillna(0)
| Row ID | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | State | Postal Code | Region | Product ID | Category | Sub-Category | Product Name | Sales | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | CA-2017-152156 | 08/11/2017 | 11/11/2017 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | Kentucky | 42420.0 | South | FUR-BO-10001798 | Furniture | Bookcases | Bush Somerset Collection Bookcase | 261.9600 |
| 1 | 2 | CA-2017-152156 | 08/11/2017 | 11/11/2017 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | Kentucky | 42420.0 | South | FUR-CH-10000454 | Furniture | Chairs | Hon Deluxe Fabric Upholstered Stacking Chairs,... | 731.9400 |
| 2 | 3 | CA-2017-138688 | 12/06/2017 | 16/06/2017 | Second Class | DV-13045 | Darrin Van Huff | Corporate | United States | Los Angeles | California | 90036.0 | West | OFF-LA-10000240 | Office Supplies | Labels | Self-Adhesive Address Labels for Typewriters b... | 14.6200 |
| 3 | 4 | US-2016-108966 | 11/10/2016 | 18/10/2016 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | Florida | 33311.0 | South | FUR-TA-10000577 | Furniture | Tables | Bretford CR4500 Series Slim Rectangular Table | 957.5775 |
| 4 | 5 | US-2016-108966 | 11/10/2016 | 18/10/2016 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | Florida | 33311.0 | South | OFF-ST-10000760 | Office Supplies | Storage | Eldon Fold 'N Roll Cart System | 22.3680 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9795 | 9796 | CA-2017-125920 | 21/05/2017 | 28/05/2017 | Standard Class | SH-19975 | Sally Hughsby | Corporate | United States | Chicago | Illinois | 60610.0 | Central | OFF-BI-10003429 | Office Supplies | Binders | Cardinal HOLDit! Binder Insert Strips,Extra St... | 3.7980 |
| 9796 | 9797 | CA-2016-128608 | 12/01/2016 | 17/01/2016 | Standard Class | CS-12490 | Cindy Schnelling | Corporate | United States | Toledo | Ohio | 43615.0 | East | OFF-AR-10001374 | Office Supplies | Art | BIC Brite Liner Highlighters, Chisel Tip | 10.3680 |
| 9797 | 9798 | CA-2016-128608 | 12/01/2016 | 17/01/2016 | Standard Class | CS-12490 | Cindy Schnelling | Corporate | United States | Toledo | Ohio | 43615.0 | East | TEC-PH-10004977 | Technology | Phones | GE 30524EE4 | 235.1880 |
| 9798 | 9799 | CA-2016-128608 | 12/01/2016 | 17/01/2016 | Standard Class | CS-12490 | Cindy Schnelling | Corporate | United States | Toledo | Ohio | 43615.0 | East | TEC-PH-10000912 | Technology | Phones | Anker 24W Portable Micro USB Car Charger | 26.3760 |
| 9799 | 9800 | CA-2016-128608 | 12/01/2016 | 17/01/2016 | Standard Class | CS-12490 | Cindy Schnelling | Corporate | United States | Toledo | Ohio | 43615.0 | East | TEC-AC-10000487 | Technology | Accessories | SanDisk Cruzer 4 GB USB Flash Drive | 10.3840 |
9800 rows × 18 columns
# Sort dataframe by date
df.sort_values(by=['Order Date'], inplace = True)
# Change data type of 'Order Date' and 'Ship Date' columns
df['Order Date'] = pd.to_datetime(df['Order Date'],format='%d/%m/%Y')
df['Ship Date'] = pd.to_datetime(df['Ship Date'],format='%d/%m/%Y')
# Split 'Order Date' and 'Ship Date' columns into respective composition
df['order_day'] = df['Order Date'].dt.day
df['order_which_day'] = df['Order Date'].dt.day_name()
df['order_month'] = df['Order Date'].dt.month_name()
df['order_year'] = df['Order Date'].dt.year
df['Ship_day'] = df['Ship Date'].dt.day
df['Ship_which_day'] = df['Ship Date'].dt.day_name()
df['Ship_month'] = df['Ship Date'].dt.month_name()
df['Ship_year'] = df['Ship Date'].dt.year
# Total sales per year
import plotly.express as px
fig = px.box(df, x = 'order_year', y ="Sales", color = 'order_year', color_discrete_sequence = px.colors.qualitative.Set3)
fig.update_layout(plot_bgcolor = 'black',
paper_bgcolor = 'black',
font_color = 'white',
title = 'Sales Per Year',
xaxis_title = 'Year',
title_x = 0.5,
yaxis_title = 'Sales')
fig.update_xaxes(showgrid=False, zeroline=False)
fig.update_yaxes(showgrid=False, zeroline=False)
fig.show()
# Find lead time per order and average lead time overall
df['Lead_Time'] = (df['Ship Date'] - df['Order Date']).dt.days
Average_lead_time = df['Lead_Time'].sum()/len(df['Lead_Time'])
print(Average_lead_time)
3.9611224489795918
# Lead time per order
from plotly.graph_objects import Layout
fig = px.histogram(df, x = 'Lead_Time' , color_discrete_sequence=px.colors.qualitative.Set3)
layout = Layout(plot_bgcolor='rgba(0,0,0,0)')
fig.layout
fig.update_layout(
title = 'Lead Time (Average {:.2f} days)'.format(Average_lead_time),
xaxis_title = 'Days',
title_x = 0.5,
yaxis_title = 'No.of Orders',
template="plotly_dark")
fig.update_layout(hovermode="x")
# Total/Average sales per momth
from sort_dataframeby_monthorweek import Sort_Dataframeby_Month, Sort_Dataframeby_Weekday
df_mon_day_mean = df.groupby(['order_year','order_month'], as_index=False)['Sales'].mean()
df_mon_day_sum = df.groupby(['order_year','order_month'], as_index=False)['Sales'].sum()
df_sorted_mon_date_mean = Sort_Dataframeby_Month(df = df_mon_day_mean, monthcolumnname='order_month')
df_sorted_mon_date_sum = Sort_Dataframeby_Month(df = df_mon_day_sum, monthcolumnname='order_month')
# Create date column for plotting graph
df_sorted_mon_date_mean['DATE'] = pd.to_datetime(df_sorted_mon_date_mean['order_year'].astype(str) + '/' + df_sorted_mon_date_mean['order_month'].astype(str) + '/01')
df_sorted_mon_date_sum['DATE'] = pd.to_datetime(df_sorted_mon_date_sum['order_year'].astype(str) + '/' + df_sorted_mon_date_sum['order_month'].astype(str) + '/01')
# Total sales over time
fig = px.line(df_sorted_mon_date_sum, x = 'DATE', y = 'Sales', color = 'order_year', color_discrete_sequence=px.colors.qualitative.Set3)
layout = Layout(plot_bgcolor='rgba(0,0,0,0)')
fig.layout
fig.update_layout(
title = 'Total Sales Over Time',
xaxis_title = 'Order Date',
yaxis_title = 'Sales',
legend_title = "Year",
template="plotly_dark")
fig.update_traces(mode="markers+lines", hovertemplate=None)
fig.update_layout(hovermode="x")
# Total sales per month
fig = px.line(df_sorted_mon_date_sum, x = 'order_month', y = 'Sales', color = 'order_year', color_discrete_sequence=px.colors.qualitative.Set3)
layout = Layout(plot_bgcolor='rgba(0,0,0,0)')
fig.layout
fig.update_layout(
title = 'Total Sales Per Month',
title_x = 0.5,
xaxis_title = 'Month',
yaxis_title = 'Sales',
legend_title = "Year",
template="plotly_dark")
fig.update_traces(mode="markers+lines", hovertemplate=None)
fig.update_layout(hovermode="x")
# Average sales per year
average_sales_per_year = df['Sales'].sum()/len(df['order_year'].unique())
print(average_sales_per_year)
565384.195675
# Average sales over time
fig = px.line(df_sorted_mon_date_mean, x = 'DATE', y = 'Sales', color = 'order_year', color_discrete_sequence=px.colors.qualitative.Set3)
layout = Layout(plot_bgcolor='rgba(0,0,0,0)')
fig.layout
fig.update_layout(title = 'Average Sales Over Time (US${:.2f} Per Year)'.format(average_sales_per_year),
title_x = 0.5,
xaxis_title = 'Order Date',
yaxis_title = 'Sales',
legend_title = "Year",
template="plotly_dark")
fig.update_traces(mode="markers+lines", hovertemplate=None)
fig.update_layout(hovermode="x")
# Average sales per month
fig = px.line(df_sorted_mon_date_mean, x = 'order_month', y = 'Sales', color = 'order_year', color_discrete_sequence=px.colors.qualitative.Set3)
layout = Layout(plot_bgcolor='rgba(0,0,0,0)')
fig.layout
fig.update_layout(title = 'Average Sales Per Month',
title_x = 0.5,
xaxis_title = 'Month',
yaxis_title = 'Sales',
legend_title = "Year",
template="plotly_dark")
fig.update_traces(mode="markers+lines", hovertemplate=None)
fig.update_layout(hovermode="x")
# Total/Average sales per day of week
df_mon_wday_mean = df.groupby(['order_year','order_month','order_which_day'], as_index=False)['Sales'].mean()
df_mon_wday_mean = Sort_Dataframeby_Month(df = df_mon_wday_mean, monthcolumnname='order_month')
df_mon_wday_mean = Sort_Dataframeby_Weekday(df = df_mon_wday_mean, Weekdaycolumnname='order_which_day')
df_mon_wday_sum = df.groupby(['order_year','order_month','order_which_day'], as_index=False)['Sales'].sum()
df_mon_wday_sum = Sort_Dataframeby_Month(df = df_mon_wday_sum, monthcolumnname='order_month')
df_mon_wday_sum = Sort_Dataframeby_Weekday(df = df_mon_wday_sum, Weekdaycolumnname='order_which_day')
# Total sales per day of week
import plotly.express as px
from plotly.graph_objects import Layout
fig = px.line(df_mon_wday_sum, x = 'order_which_day', y = 'Sales', color = 'order_month', facet_row="order_year", color_discrete_sequence=px.colors.qualitative.Set3)
layout = Layout(plot_bgcolor='rgba(0,0,0,0)')
fig.layout
fig.update_layout(title = 'Total Sales Per Day of Week',
title_x = 0.45,
xaxis_title = 'Day of Week',
yaxis_title = 'Sales',
legend_title = "Month",
template="plotly_dark")
fig.for_each_yaxis(lambda y: y.update(title = ''))
fig.add_annotation(x = -0.08 ,y = 0.5,
showarrow = False,
text="Average Sales", textangle=-90,
xref="paper", yref="paper")
fig.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
fig.update_traces(mode="markers+lines", hovertemplate=None)
fig.update_layout(hovermode="x")
# Average sales per day of week
fig = px.line(df_mon_wday_mean, x = 'order_which_day', y = 'Sales', color = 'order_month', facet_row="order_year", color_discrete_sequence=px.colors.qualitative.Set3)
layout = Layout(plot_bgcolor='rgba(0,0,0,0)')
fig.layout
fig.update_layout(title = 'Average Sales Per Day of Week',
title_x = 0.45,
xaxis_title = 'Day of Week',
yaxis_title = 'Sales',
legend_title = "Month",
template="plotly_dark")
fig.for_each_yaxis(lambda y: y.update(title = ''))
fig.add_annotation(x = -0.08 ,y = 0.5,
showarrow = False,
text="Average Sales", textangle=-90,
xref="paper", yref="paper")
fig.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
fig.update_traces(mode="markers+lines", hovertemplate=None)
fig.update_layout(hovermode="x")
# Source: https://gist.github.com/rogerallen/1583593
# List of US states and their abbreviations
us_state_to_abbrev = {
"Alabama": "AL",
"Alaska": "AK",
"Arizona": "AZ",
"Arkansas": "AR",
"California": "CA",
"Colorado": "CO",
"Connecticut": "CT",
"Delaware": "DE",
"Florida": "FL",
"Georgia": "GA",
"Hawaii": "HI",
"Idaho": "ID",
"Illinois": "IL",
"Indiana": "IN",
"Iowa": "IA",
"Kansas": "KS",
"Kentucky": "KY",
"Louisiana": "LA",
"Maine": "ME",
"Maryland": "MD",
"Massachusetts": "MA",
"Michigan": "MI",
"Minnesota": "MN",
"Mississippi": "MS",
"Missouri": "MO",
"Montana": "MT",
"Nebraska": "NE",
"Nevada": "NV",
"New Hampshire": "NH",
"New Jersey": "NJ",
"New Mexico": "NM",
"New York": "NY",
"North Carolina": "NC",
"North Dakota": "ND",
"Ohio": "OH",
"Oklahoma": "OK",
"Oregon": "OR",
"Pennsylvania": "PA",
"Rhode Island": "RI",
"South Carolina": "SC",
"South Dakota": "SD",
"Tennessee": "TN",
"Texas": "TX",
"Utah": "UT",
"Vermont": "VT",
"Virginia": "VA",
"Washington": "WA",
"West Virginia": "WV",
"Wisconsin": "WI",
"Wyoming": "WY",
"District of Columbia": "DC",
"American Samoa": "AS",
"Guam": "GU",
"Northern Mariana Islands": "MP",
"Puerto Rico": "PR",
"United States Minor Outlying Islands": "UM",
"U.S. Virgin Islands": "VI",
}
# Create total sales per state column
state_df = df[['State', 'Sales']].copy()
state_df['State_abbr']= state_df['State'].map(lambda x: us_state_to_abbrev.get(x, x))
sales_by_state = state_df.groupby(['State', 'State_abbr'], as_index=False).sum()
# Sales by state
import plotly.graph_objects as go
fig = go.Figure(data=go.Choropleth(
locations = sales_by_state["State_abbr"], # Spatial coordinates
text = sales_by_state['State'],
z = sales_by_state['Sales'], # Data to be color-coded
locationmode = 'USA-states', # set of locations match entries in `locations`
colorscale = 'Reds',
colorbar_title = "Sales",
))
fig.update_layout(title = 'Total Sales by State',
geo_scope='usa', # limite map scope to USA
plot_bgcolor = 'black',
paper_bgcolor = 'black',
font_color = 'white',
geo = dict(bgcolor= 'rgba(0,0,0,0)'),
title_x = 0.5
)
fig.show();
# Sales per category
fig = px.sunburst(df, path=['Segment', 'Category', 'Sub-Category'], values='Sales', color = 'Category',
color_discrete_sequence = px.colors.qualitative.Set3)
fig.update_traces(leaf_opacity= 1)
fig.update_layout(paper_bgcolor = 'black')
fig.update_layout(plot_bgcolor = 'black',
paper_bgcolor = 'black',
font_color = 'white',
title = 'Sales Per Category',
title_x = 0.5)
fig.show()
# Sales contributed per state
fig = px.treemap(df, path=[px.Constant('Total Sales'), 'State', 'Segment', 'Category', 'Sub-Category'], values='Sales', color_discrete_sequence=px.colors.qualitative.Set3)
fig.update_layout(plot_bgcolor = 'black',
paper_bgcolor = 'black',
font_color = 'white',
title = 'Overview of Sales Contributed Per State',
title_x = 0.5)
fig.data[0]['textfont']['color'] = "white"
fig.show()
# Sales by city
Top_cities = df.groupby(["City"], as_index = False)['Sales'].sum().sort_values("Sales", ascending=False).head(20) # Sort the States as per the sales
Top_cities.reset_index(inplace=True)
Top_cities
| index | City | Sales | |
|---|---|---|---|
| 0 | 327 | New York City | 252462.5470 |
| 1 | 265 | Los Angeles | 173420.1810 |
| 2 | 450 | Seattle | 116106.3220 |
| 3 | 436 | San Francisco | 109041.1200 |
| 4 | 372 | Philadelphia | 108841.7490 |
| 5 | 207 | Houston | 63956.1428 |
| 6 | 80 | Chicago | 47820.1330 |
| 7 | 435 | San Diego | 47521.0290 |
| 8 | 216 | Jacksonville | 44713.1830 |
| 9 | 123 | Detroit | 42446.9440 |
| 10 | 462 | Springfield | 41827.8100 |
| 11 | 94 | Columbus | 38662.5630 |
| 12 | 328 | Newark | 28448.0490 |
| 13 | 93 | Columbia | 25283.3240 |
| 14 | 215 | Jackson | 24963.8580 |
| 15 | 233 | Lafayette | 24944.2800 |
| 16 | 432 | San Antonio | 21843.5280 |
| 17 | 60 | Burlington | 21668.0820 |
| 18 | 16 | Arlington | 20214.5320 |
| 19 | 109 | Dallas | 20127.9482 |
fig = px.bar(Top_cities, x="City", y="Sales", title="Sales Contributed Per City", color_discrete_sequence=px.colors.qualitative.Set3)
fig.update_layout(title_x = 0.45,
xaxis_title = 'Day of Week',
yaxis_title = 'Sales',
plot_bgcolor = 'black',
paper_bgcolor = 'black',
font_color = 'white')
fig.update_xaxes(showgrid=False, tickangle=270)
fig.update_yaxes(showgrid=False)
fig.show()
Sales revenue generally increases over time
Average \$565k per year in revenue
Highest sales revenue in November
On average, people tend to spend more on Thursday
Sales revenue generated mostly from Consumers
California had the highest total revenue till date of \$446K
New York City had the highest total revenue till date of \$252K
Average 3.96 days lead time per order